// STL
#include <algorithm>                // std::sort(), ::max()
#include <cmath>                    // std::sqrt()
#include <cstdlib>                  // std::abs()
#include <utility>                  // std::pair<>, ::make_pair()
#include <vector>                   // std::vector<>

// stats++
#include "statsxx/distribution.hpp" // distribution::Q_KS


//
// DESC: Calculate the Kolmogorov--Smirnov statistic D and the p-value for the null hypothesis that the data sets are drawn from the same distribution.
//
// NOTE: Small values of p show that the cumulative distribution functions of the two data sets are significantly different.
//
// -----
//
// NOTE: The K--S statistic is intended for use with a continuous distribution.
//
// NOTE: ... It can also be used for a discrete distribution, though.
//
// NOTE: ... In this case, the test is conservative (the statistic is no larger than in the continuous case).
//
// NOTE: This consideration/effect becomes more important/significant with the number of ties (see below).
//
// -----
//
// NOTE: Ties (because of a discrete distribution) are handled in the standard way.
//
// -----
//
// NOTE: See "Numerical Recipes, 3rd Ed." Section 14.3.3.
//
inline std::pair<
              double, // D
              double  // p-value
              > distribution::KS_two(
                                     std::vector<double> data_1,
                                     std::vector<double> data_2
                                     )
{
    double D;
    double p;

    std::sort(data_1.begin(), data_1.end());
    std::sort(data_2.begin(), data_2.end());

    int j_1 = 0;
    int j_2 = 0;

    double fn_1 = 0.;
    double fn_2 = 0.;

    D = 0.;

    while( (j_1 < data_1.size()) && (j_2 < data_2.size()) ) // while we are not done ...
    {
        double d_1;
        double d_2;

        // next step is in data_1
        if( (d_1 = data_1[j_1]) <= (d_2 = data_2[j_2]) )
        {
            do
            {
                fn_1 = ++j_1/static_cast<double>(data_1.size());
            } while( (j_1 < data_1.size()) && (d_1 == data_1[j_1]) );
        }

        // next step is in data_2
        if( d_2 <= d_1 )
        {
            do
            {
                fn_2 = ++j_2/static_cast<double>(data_2.size());
            } while( (j_2 < data_2.size()) && (d_2 == data_2[j_2]) );
        }

        D = std::max(D, std::abs((fn_2 - fn_1)));
    }

    // CALCULATE THE EFFECTIVE (e) NUMBER OF DATA POINTS
    //
    // NOTE: Eq. (14.3.19)
    double N_e = static_cast<double>(data_1.size()*data_2.size())/(data_1.size() + data_2.size());

    // CALCULATE p-VALUE
    //
    // NOTE: Eq. (14.3.18)
    //
    // NOTE: This formula is an approximation to that the p-value of an observed value of D (as a disproof of the null hypothesis that the distributions are the same).
    //
    // NOTE: ... The nature of this approximation is that it becomes asymptotically accurate as the N_e becomes large.
    p = distribution::Q_KS( ((std::sqrt(N_e) + 0.12 + 0.11/std::sqrt(N_e))*D) );

    // RETURN
    return std::make_pair(
                          D,
                          p
                          );
}
